library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(xgboost)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.5
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::margin() masks randomForest::margin()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ dplyr::slice() masks xgboost::slice()
## ✖ lubridate::union() masks base::union()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(segmented)
# Project helpers (rmse, fit_xgboost, fit_svm, ...) and the pre-built
# analysis tables (features, data_before, features_on, outcomes, ...).
source('functions.r')
load("Table_construction.Rdata")
# Assemble the analysis table:
#  1. join person-level info (race, sex, name) from data_before,
#  2. join COMPAS features and outcomes,
#  3. derive raw/decile scores, capped count features, race indicators,
#     subscale sums, and the row filters filt1-filt6 used downstream.
features = features %>%
  # Add other useful information:
  inner_join(
    data_before %>%
      select(person_id, screening_date, people) %>%
      unnest() %>%
      select(person_id, screening_date, race, sex, name),
    by = c("person_id","screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id","screening_date")) %>%
  inner_join(outcomes, by = c("person_id","screening_date")) %>%
  # Drop the duplicate column produced by the joins
  select(-c(offenses_within_30.y)) %>%
  # Create as many features as possible:
  mutate(
    raw_score = `Risk of Violence_raw_score`, # Adjust for violent/general
    decile_score = `Risk of Violence_decile_score`, # Adjust for violent/general
    # Cap the count features
    p_juv_fel_count = pmin(p_juv_fel_count,2),
    p_felprop_violarrest = pmin(p_felprop_violarrest,5),
    p_murder_arrest = pmin(p_murder_arrest,3),
    p_felassault_arrest = pmin(p_felassault_arrest,3),
    p_misdemassault_arrest = pmin(p_misdemassault_arrest,3),
    #p_famviol_arrest = pmin(p_famviol_arrest,3),
    p_sex_arrest = pmin(p_sex_arrest,3),
    p_weapons_arrest = pmin(p_weapons_arrest,3),
    p_n_on_probation = pmin(p_n_on_probation,5),
    p_current_on_probation = pmin(p_current_on_probation,5),
    p_prob_revoke = pmin(p_prob_revoke,5),
    # Race indicators; race == "Other" is the baseline
    race_black = if_else(race=="African-American",1,0),
    race_white = if_else(race=="Caucasian",1,0),
    race_hispanic = if_else(race=="Hispanic",1,0),
    race_asian = if_else(race=="Asian",1,0),
    race_native = if_else(race=="Native American",1,0),
    # Subscales:
    vio_hist = p_juv_fel_count+
      p_felprop_violarrest+
      p_murder_arrest+
      p_felassault_arrest+
      p_misdemassault_arrest+
      #p_famviol_arrest+ #because no observations have nonzero for this
      p_sex_arrest+
      p_weapons_arrest,
    history_noncomp = p_prob_revoke+
      p_probation+p_current_on_probation+
      p_n_on_probation,
    # Filters (TRUE for observations to keep)
    # BUG FIX: the two decile-score conditions were separated by a comma,
    # which made the second one an unnamed mutate() column rather than part
    # of filt1; join them with & so filt1 requires BOTH scores to be valid.
    filt1 = `Risk of Recidivism_decile_score` != -1 &
            `Risk of Violence_decile_score` != -1,
    filt2 = offenses_within_30.x == 1,
    filt3 = !is.na(current_offense_date),
    filt4 = current_offense_date <= current_offense_date_limit,
    filt5 = p_current_age > 18 & p_current_age <= 65,
    filt6 = vio_hist == 0 & history_noncomp==0
  )
Risk of Violence: we modelled age with a 4th-degree polynomial. See broward_sheriff_data for our efforts to model it with a variety of exponentials (we could not get those quite right, so we used a polynomial in the end). ## Fit age polynomial
## Fit the age lower-bound polynomial on the cleanest subset:
## valid decile scores (filt1), ages 18-65 (filt5), and no violence /
## noncompliance history (filt6).
features_age_poly = features %>%
filter(filt1,filt5, filt6)
# Keep, for each age, the observation(s) with the minimum raw score; these
# points trace the empirical lower bound the polynomial is fitted to.
# (top_n with n = -1 keeps ties.)
lb_age = features_age_poly %>%
group_by(p_current_age) %>%
arrange(raw_score) %>%
top_n(n=-1, wt=raw_score) # Fit lower bound on smallest value
# Quartic polynomial in age, fitted to the lower-bound points only.
mdl_age = lm(raw_score ~
I(p_current_age^4) +
I(p_current_age^3) +
I(p_current_age^2) +
p_current_age,
data=lb_age)
# More precision for paper
summary(mdl_age)
##
## Call:
## lm(formula = raw_score ~ I(p_current_age^4) + I(p_current_age^3) +
## I(p_current_age^2) + p_current_age, data = lb_age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57803 -0.00688 0.00225 0.03572 0.13691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.377e+00 1.007e+00 1.368 0.17505
## I(p_current_age^4) 4.045e-07 4.368e-07 0.926 0.35718
## I(p_current_age^3) -8.495e-05 7.258e-05 -1.170 0.24529
## I(p_current_age^2) 7.052e-03 4.347e-03 1.622 0.10861
## p_current_age -3.010e-01 1.106e-01 -2.721 0.00798 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08883 on 81 degrees of freedom
## Multiple R-squared: 0.9793, Adjusted R-squared: 0.9783
## F-statistic: 957.8 on 4 and 81 DF, p-value: < 2.2e-16
print("Coefficients:")
## [1] "Coefficients:"
# Print coefficients at 20 significant digits so the paper can reproduce
# the fitted polynomial exactly.
sprintf("%.20e",mdl_age$coefficients) # More precision for paper
## [1] "1.37748225676627056302e+00" "4.04470595978981737175e-07"
## [3] "-8.49491450689363584601e-05" "7.05181080242555605869e-03"
## [5] "-3.01011912451752128295e-01"
## Add f(age) to features and add filter
# `features` and `lb_age` need the same three derived columns, so compute
# them with one helper instead of duplicating the mutate() call.
#   age_poly            : fitted lower-bound polynomial value at each age
#   raw_score__age_poly : raw score with the age trend removed
#   filt7               : TRUE when the point lies on/above the polynomial
#                         minus a small tolerance (i.e. not an age outlier)
add_age_poly_columns = function(df, mdl, tol = 0.05) {
  df %>%
    mutate(
      age_poly = predict(mdl, newdata = data.frame(p_current_age = p_current_age)),
      raw_score__age_poly = raw_score - age_poly,
      filt7 = raw_score >= age_poly - tol
    )
}
features = add_age_poly_columns(features, mdl_age)
## Add same columns as above to lb_age
lb_age = add_age_poly_columns(lb_age, mdl_age)
Plotting settings for age analysis
# Axis limits for the age plots, and a dense age grid used to draw the
# fitted lower-bound curves.
xmin = 18
xmax = 65
xx = seq(from = xmin, to = xmax, length.out = 1000)
Generate a preliminary plot of age vs COMPAS violence score
# Lower-bound points, colored by whether they pass filt7 (on/above the
# fitted polynomial), with the polynomial overlaid.
ggplot()+
geom_point(aes(x=p_current_age, raw_score, color = factor(filt7)),alpha=.3, data=lb_age) +
geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
theme_bw()+
xlim(xmin,xmax)+
xlab("Age at COMPAS screening date") +
ylab("Violent score") +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position="none")
# Same fitted polynomial over the full filtered (filt1, filt5, filt6) data.
ggplot()+
geom_point(aes(x=p_current_age, raw_score), color="#619CFF",alpha=.3, data=features_age_poly) +
geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
theme_bw()+
xlim(xmin,xmax)+
xlab("Age at COMPAS screening date") +
ylab("Violent score") +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position="none")
#Filter out outliers under the age polynomial, individuals with
#noncomp history/violence history != 0
features_age_poly2 = features %>%
filter(filt1, filt5, filt6, filt7)
# filter(p_age_first_offense == p_current_age)
# Recompute the per-age minimum-score points after removing age outliers;
# these are the points the linear spline will be fitted to.
lb_filt = features_age_poly2 %>%
group_by(p_current_age) %>%
arrange(raw_score)%>%
top_n(n=-1, wt = raw_score)
# One row per age: the lower-bound score and how many individuals attain it
# (top_n keeps ties, so n() can exceed 1).
lb_values = lb_filt %>%
select(p_current_age, raw_score)%>%
group_by(p_current_age) %>%
summarize(raw_score = unique(raw_score),
n_inds = n()) %>%
arrange(p_current_age)
#filtered out points
lb_outliers = rbind(
#reason not included in lb_filt was
#age not above age poly
setdiff(lb_age, lb_filt) %>%
filter(!filt7) %>% #below age poly
mutate(reason = "Below age polynomial")
) %>%
select(p_current_age, p_age_first_offense, raw_score, reason)
# Combined plotting table: spline-construction points plus excluded outliers,
# labelled by `reason` for the plot legend.
lb = lb_filt %>%
select(p_current_age, p_age_first_offense, raw_score) %>%
mutate(reason = "Used to construct linear splines") %>%
rbind(lb_outliers)
Generating new age spline.
# Piecewise-linear (segmented) regression of the lower-bound points on age,
# with initial breakpoint guesses at ages 22, 36 and 44.
mdl_age_spline <- segmented(lm(raw_score ~ p_current_age, data = lb_filt),
seg.Z = ~p_current_age, psi = list(p_current_age = c(22,36,44)),
control = seg.control(display = FALSE)
)
# Add filter 8: TRUE when the raw score lies on/above the fitted age spline
# minus a 0.05 tolerance (spline analogue of filt7).
features = features %>%
mutate(
age_spline = predict(mdl_age_spline, newdata=data.frame(p_current_age=p_current_age)),
raw_score__age_spline = raw_score - age_spline,
filt8 = raw_score >= age_spline - 0.05
)
Plotting individuals on the new bottom edge produced by fitting to the lb_filt individuals.
# Estimated breakpoints of the segmented fit (column 2 of $psi holds the
# estimates) and one x-grid per linear segment, so each segment can be
# drawn in its own color.
break1 = summary.segmented(mdl_age_spline)$psi[1,2]
break2 = summary.segmented(mdl_age_spline)$psi[2,2]
break3 = summary.segmented(mdl_age_spline)$psi[3,2]
xx1 = seq(xmin,break1, length.out=500)
xx2 = seq(break1,break2, length.out=500)
xx3 = seq(break2,break3, length.out=500)
xx4 = seq(break3,xmax, length.out=500)
# Lower-bound points (colored by inclusion reason) with the four spline
# segments overlaid.
age_spline_viol =
ggplot()+
geom_point(aes(x=p_current_age, raw_score, color= factor(reason)),alpha=.5, data=lb ) +
scale_color_manual(values=c("red", "grey25")) +
geom_line(aes(x=xx1, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx1))),
color="#F8766D", size = 1) +
geom_line(aes(x=xx2, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx2))),
color="darkorange1", size = 1) +
geom_line(aes(x=xx3, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx3))),
color="cyan3", size = 1) +
geom_line(aes(x=xx4, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx4))),
color="seagreen3", size = 1) +
theme_bw()+
xlim(xmin,xmax)+
labs(
x = "\n Age at COMPAS screening date",
y = "Violence Score \n"
) +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position = c(.95,.95),
legend.title = element_blank(),
# legend.key.width = unit(1.5, "cm"),
legend.justification = c("right", "top"),
legend.key = element_rect(fill = "aliceblue"),
legend.background = element_rect(fill="aliceblue",
size=0.5, linetype="solid")
)
age_spline_viol
ggsave("Figures/age_LB_violent.pdf",width = 3.5, height = 2.5, units = "in")
# Age spline with all data (filters 1,5 still applied)
# Red points are below the age polynomial not age spline
ggplot()+
  geom_point(aes(x=p_current_age, raw_score), color="#F8766D", data=features %>% filter(filt1,filt5,!filt7)) + # Age outliers
  geom_point(aes(x=p_current_age, raw_score), color="#619CFF", alpha=.3, data=features %>% filter(filt1,filt5,filt7)) + # Not age outliers
  geom_line(aes(x=xx, predict(mdl_age_spline, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
  theme_bw()+
  xlim(xmin,xmax)+
  xlab("Age at COMPAS screening date") +
  # BUG FIX: this plot shows the violent raw score (like every other plot in
  # this file, and it is saved as *_violent.pdf); the label previously read
  # "General score", a copy-paste from the general-score analysis.
  ylab("Violent score") +
  theme(text = element_text(size=12),
        axis.text=element_text(size=12),
        legend.position="top")
ggsave("Figures/age_LB_alldata_violent.pdf",width = 3.5, height = 2.5, units = "in")
### Compute lower bound of raw_score__age_spline for each vio_hist value:
# Apply filters:
features_vio_hist = features %>%
filter(filt1,filt3) %>% # Careful, filt8 applied later since we need these points for plot
select(vio_hist, raw_score__age_spline,filt8)
# Compute lower bound: minimum age-adjusted score at each vio_hist value,
# computed only over points on/above the age spline (filt8).
lb_vio_hist = features_vio_hist %>%
filter(filt8) %>% # Now apply filt8
select(-filt8) %>%
group_by(vio_hist)%>%
top_n(n=-1,wt=raw_score__age_spline)%>%
rename(g_vio_hist = raw_score__age_spline) %>%
#arrange(vio_hist,.by_group=TRUE) %>%
arrange(vio_hist) %>%
ungroup()
# Use last value of g_vio_hist if vio_hist > vio_hist_cutoff
# (higher vio_hist values are too sparse to estimate their own bound)
vio_hist_cutoff = 6
lb_vio_hist_cutoff = lb_vio_hist$g_vio_hist[lb_vio_hist$vio_hist==vio_hist_cutoff]
lb_vio_hist = lb_vio_hist %>%
mutate(g_vio_hist = if_else(vio_hist < vio_hist_cutoff, g_vio_hist, lb_vio_hist_cutoff))
# Add g(vio_hist) to features
features = features %>%
left_join(lb_vio_hist, by="vio_hist") %>%
mutate(raw_score__age_spline__g_vio_hist = raw_score__age_spline - g_vio_hist)
# Dense grid for drawing g(vio_hist) as a step-like line in the plot below.
lb_vio_hist_plot =
data.frame(vio_hist_xx = seq(0,13,length.out=10000)) %>%
mutate(vio_hist = round(vio_hist_xx)) %>%
left_join(lb_vio_hist,by="vio_hist")
ggplot() +
geom_point(data=features_vio_hist,aes(x=vio_hist,y=raw_score__age_spline,color=filt8),alpha=.5) +
geom_line(data=lb_vio_hist_plot,aes(x=vio_hist_xx,y=g_vio_hist,color="lb"))+
theme_bw()+
xlab("Sum of History of Violence Components") +
ylab(expression(Violent~score~-~f[viol~age])) +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top") +
scale_color_manual(name=element_blank(),
breaks=c("TRUE", "FALSE","lb"),
labels=c(expression(Above~f[viol~age]), expression(Below~f[viol~age]),expression(g[viol~hist])),
values=c("TRUE"="#619CFF","FALSE"="#00BA38","lb"="#F8766D"))
rm(lb_vio_hist_plot)
ggsave("Figures/vio_hist_LB_violent.pdf",width = 5, height = 3, units = "in")
# Scatter of the doubly-adjusted score (age spline and g(vio_hist) removed)
# against the noncompliance-history subscale.
features %>%
filter(filt1,filt3) %>%
select(history_noncomp, raw_score__age_spline__g_vio_hist, filt8) %>%
ggplot() +
geom_point(aes(x=history_noncomp,y=raw_score__age_spline__g_vio_hist,color=filt8),alpha=.5) +
theme_bw()+
xlab("Sum of History of Noncompliance Components") +
ylab(expression(Violent~score~-~f[viol~age]~-~g[viol~hist])) +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top") +
scale_color_manual(name=element_blank(),
breaks=c("TRUE", "FALSE"),
labels=c(expression(Above~f[viol~age]), expression(Below~f[viol~age])),
values=c("TRUE"="#619CFF","FALSE"="#00BA38"))
ggsave("Figures/hist_noncomp_LB_violent.pdf",width = 5, height = 3, units = "in")
There are a few groups of predictors we will use — Group 1: neither age variables nor race; Group 2: race but no age variables; Group 3: age variables but no race; Group 4: both age variables and race.
#### Generic stuff (applies to all models)
## Filter rows
features_filt = features %>%
filter(filt1, filt3)
## Set parameters (each combination will be run)
# NOTE(review): "reg:linear" is the legacy xgboost name for squared-error
# regression; newer versions call it "reg:squarederror" — confirm against
# the installed xgboost version before upgrading.
param <- list(objective = "reg:linear",
eval_metric = "rmse",
eta = c(.05,.1),
gamma = c(.5, 1),
max_depth = c(2,5),
min_child_weight = c(5,10),
subsample = c(1),
colsample_bytree = c(1)
)
# svm
# gamma_scale multiplies the default gamma (see fit_svm in functions.r).
param_svm = list(
type = 'eps-regression',
cost = c(0.5,1,2),
epsilon = c(0.5,1,1.5),
gamma_scale = c(0.5,1,2)
)
# Training RMSE per predictor group (rows) and model family (columns).
res_rmse = data.frame(Group = 1:4, lm = NA, xgb = NA, rf = NA, svm = NA)
### Create group 1 training data
### Group 1: no age variables, no race. Target is the age-adjusted score.
## Select features and round count features
train = features_filt %>%
select(
#p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Linear model baseline
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0423 -0.3256 -0.1199 0.2067 3.7693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.635535 0.009108 69.776 < 2e-16 ***
## p_age_first_offense -0.002209 0.000287 -7.696 1.46e-14 ***
## p_juv_fel_count 0.128035 0.018591 6.887 5.86e-12 ***
## p_felprop_violarrest 0.117539 0.005655 20.785 < 2e-16 ***
## p_murder_arrest 0.117353 0.045875 2.558 0.0105 *
## p_felassault_arrest 0.190687 0.010863 17.554 < 2e-16 ***
## p_misdemassault_arrest 0.183398 0.010030 18.285 < 2e-16 ***
## p_sex_arrest 0.182693 0.038842 4.703 2.57e-06 ***
## p_weapons_arrest 0.111707 0.014999 7.448 9.85e-14 ***
## p_n_on_probation 0.099579 0.004048 24.598 < 2e-16 ***
## p_current_on_probation 0.155303 0.019293 8.050 8.73e-16 ***
## p_prob_revoke 0.201229 0.015975 12.596 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4807 on 21296 degrees of freedom
## Multiple R-squared: 0.2281, Adjusted R-squared: 0.2277
## F-statistic: 572.2 on 11 and 21296 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==1,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(46)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==1,]$xgb = rmse(pred, actual) # ADJUST GROUP
# Shared axis range so the y = x reference line is meaningful.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab("Violent score - f(age)") +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(55656)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
# NOTE(review): mdl_rf$predicted are out-of-bag predictions, so this RMSE is
# OOB whereas the lm/xgb/svm entries use in-sample fits — confirm intended.
res_rmse[res_rmse$Group==1,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 10
## type "eps-regression"
## cost "0.5"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.08333333"
res_rmse[res_rmse$Group==1,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)
### Create group 2 training data
### Group 2: race indicators included, no age variables.
## Select features and round count features
train = features_filt %>%
select(
#p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
race_black,
race_white,
race_hispanic,
race_asian,
race_native, # race == "Other" is the baseline
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Linear model baseline
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0551 -0.3131 -0.1100 0.2026 3.6229
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4133605 0.0155902 26.514 < 2e-16 ***
## p_age_first_offense -0.0003337 0.0002917 -1.144 0.25263
## p_juv_fel_count 0.1186751 0.0182569 6.500 8.19e-11 ***
## p_felprop_violarrest 0.1136343 0.0055519 20.468 < 2e-16 ***
## p_murder_arrest 0.1206917 0.0450288 2.680 0.00736 **
## p_felassault_arrest 0.1862307 0.0106634 17.465 < 2e-16 ***
## p_misdemassault_arrest 0.1829102 0.0098450 18.579 < 2e-16 ***
## p_sex_arrest 0.1739793 0.0381234 4.564 5.06e-06 ***
## p_weapons_arrest 0.1016765 0.0147282 6.904 5.22e-12 ***
## p_n_on_probation 0.0942367 0.0039778 23.691 < 2e-16 ***
## p_current_on_probation 0.1483990 0.0189419 7.834 4.93e-15 ***
## p_prob_revoke 0.1936704 0.0156821 12.350 < 2e-16 ***
## race_black 0.2672590 0.0136012 19.650 < 2e-16 ***
## race_white 0.1243048 0.0136794 9.087 < 2e-16 ***
## race_hispanic 0.0326662 0.0162508 2.010 0.04443 *
## race_asian -0.0631597 0.0447106 -1.413 0.15778
## race_native 0.1526455 0.0603705 2.528 0.01146 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4718 on 21291 degrees of freedom
## Multiple R-squared: 0.2567, Adjusted R-squared: 0.2561
## F-statistic: 459.5 on 16 and 21291 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==2,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(390)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==2,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab("Violent score - f(age)") +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(2728)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
res_rmse[res_rmse$Group==2,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 2
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "0.5"
## gamma "0.02941176"
res_rmse[res_rmse$Group==2,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)
### Create group 3 training data
### Group 3: age variables included, no race.
## Select features and round count features
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Linear model baseline
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9315 -0.3150 -0.1150 0.1986 3.7819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5544851 0.0094229 58.845 < 2e-16 ***
## p_current_age 0.0184993 0.0006723 27.517 < 2e-16 ***
## p_age_first_offense -0.0194415 0.0006868 -28.306 < 2e-16 ***
## p_juv_fel_count 0.1165548 0.0182743 6.378 1.83e-10 ***
## p_felprop_violarrest 0.0893107 0.0056510 15.804 < 2e-16 ***
## p_murder_arrest 0.0880189 0.0450946 1.952 0.05097 .
## p_felassault_arrest 0.1478925 0.0107875 13.710 < 2e-16 ***
## p_misdemassault_arrest 0.1631086 0.0098843 16.502 < 2e-16 ***
## p_sex_arrest 0.1136560 0.0382527 2.971 0.00297 **
## p_weapons_arrest 0.0764478 0.0147949 5.167 2.40e-07 ***
## p_n_on_probation 0.0819168 0.0040296 20.329 < 2e-16 ***
## p_current_on_probation 0.1630972 0.0189614 8.602 < 2e-16 ***
## p_prob_revoke 0.0790314 0.0163148 4.844 1.28e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4724 on 21295 degrees of freedom
## Multiple R-squared: 0.2546, Adjusted R-squared: 0.2542
## F-statistic: 606.3 on 12 and 21295 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==3,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(34)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==3,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(Violent~score~-~f[viol~age])) +
ylab("XGBoost prediction") +
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
ggsave("Figures/raw-fage_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(7872)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
res_rmse[res_rmse$Group==3,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 3
## type "eps-regression"
## cost "2"
## epsilon "0.5"
## gamma_scale "0.5"
## gamma "0.03846154"
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)
### Create group 4 training data
### Group 4: both age variables and race indicators.
## Select features and round count features
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
race_black,
race_white,
race_hispanic,
race_asian,
race_native, # race == "Other" is the baseline
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Linear model baseline
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9482 -0.3031 -0.1082 0.1942 3.6359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3466613 0.0155250 22.329 < 2e-16 ***
## p_current_age 0.0179295 0.0006617 27.094 < 2e-16 ***
## p_age_first_offense -0.0170374 0.0006799 -25.057 < 2e-16 ***
## p_juv_fel_count 0.1076304 0.0179551 5.994 2.08e-09 ***
## p_felprop_violarrest 0.0864735 0.0055500 15.581 < 2e-16 ***
## p_murder_arrest 0.0912311 0.0442864 2.060 0.03941 *
## p_felassault_arrest 0.1449762 0.0105944 13.684 < 2e-16 ***
## p_misdemassault_arrest 0.1631535 0.0097072 16.808 < 2e-16 ***
## p_sex_arrest 0.1073541 0.0375641 2.858 0.00427 **
## p_weapons_arrest 0.0674220 0.0145361 4.638 3.53e-06 ***
## p_n_on_probation 0.0772093 0.0039612 19.491 < 2e-16 ***
## p_current_on_probation 0.1559156 0.0186261 8.371 < 2e-16 ***
## p_prob_revoke 0.0757706 0.0160212 4.729 2.27e-06 ***
## race_black 0.2547357 0.0133809 19.037 < 2e-16 ***
## race_white 0.1076170 0.0134639 7.993 1.38e-15 ***
## race_hispanic 0.0327969 0.0159781 2.053 0.04012 *
## race_asian -0.0510765 0.0439625 -1.162 0.24532
## race_native 0.1284837 0.0593640 2.164 0.03045 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4639 on 21290 degrees of freedom
## Multiple R-squared: 0.2815, Adjusted R-squared: 0.2809
## F-statistic: 490.6 on 17 and 21290 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==4,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(11)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==4,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(Violent~score~-~f[viol~age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=12),
axis.text=element_text(size=12))
ggsave("Figures/raw-fage_xgboost_gp4_violent.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
# Individuals listed in Table 5 of the paper, identified by
# (person_id, screening_date); used to highlight them in the scatter below.
highlight = data.frame(
person_id= c(799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337, 6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414),
screening_date = ymd(c("2014-06-15","2014-05-14","2014-11-28","2013-07-29","2013-10-23","2013-10-04","2014-12-14","2013-01-17","2013-10-24","2014-02-04","2013-07-12","2014-04-26","2014-05-05","2013-03-19","2014-01-18","2014-09-20","2013-04-09","2014-02-23","2014-05-02","2014-11-26")),
highlight = TRUE
)
# Attach xgboost predictions and the Table-5 flag to the plotting data.
# (Rows not in `highlight` get NA from the left_join, converted to FALSE.)
df_plot = features_filt %>%
bind_cols(xgboost = predict(mdl_xgb, newdata=train_xgb)) %>%
left_join(highlight, by = c("person_id","screening_date")) %>%
mutate(highlight = if_else(is.na(highlight), FALSE, TRUE)) %>%
mutate(highlight = factor(if_else(highlight==TRUE,"In Table 5", "Not in Table 5"), levels=c("In Table 5", "Not in Table 5")))
# Manual label placement: which quadrant each name is nudged toward.
person_id_text_topright = c(8491, 8375, 1497)
#person_id_text_topright = highlight$person_id
person_id_text_topleft = c(5722, 11231)
person_id_text_botright = c(799, 11312, 1284, 11414)
person_id_text_botleft = c()
ggplot() +
geom_point(aes(x=xgboost,y=raw_score, color=highlight), alpha = .3, data = filter(df_plot, highlight=="Not in Table 5")) +
geom_point(aes(x=xgboost,y=raw_score, color=highlight), data = filter(df_plot, highlight=="In Table 5")) +
theme_bw()+
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topright & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topleft & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botright & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botleft & highlight=="In Table 5")) +
xlab(expression(XGBoost~pred.~of~violent~score~-~f[age])) +
ylab("Violent score")+
theme(
text = element_text(size=12),
axis.text=element_text(size=12),
#legend.position = "top",
legend.position="none") +
scale_color_discrete(name = element_blank())
ggsave("Figures/xgboost_rawScore_annotate_violent.pdf",width = 3, height = 3, units = "in")
# Group 4 random forest and SVM fits.
set.seed(379)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
res_rmse[res_rmse$Group==4,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 12
## type "eps-regression"
## cost "2"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.05555556"
res_rmse[res_rmse$Group==4,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)
# Summary of training RMSE by predictor group (rows) and model (columns).
knitr::kable(res_rmse)
| Group | lm | xgb | rf | svm |
|---|---|---|---|---|
| 1 | 0.4805623 | 0.4644645 | 0.4727648 | 0.4712996 |
| 2 | 0.4715878 | 0.4532862 | 0.4609155 | 0.4626015 |
| 3 | 0.4722399 | 0.4503872 | 0.4605618 | 0.4618827 |
| 4 | 0.4636618 | 0.4394210 | 0.4470148 | 0.4475470 |
We use the “Group 3” models but predict the raw score and the raw score minus all of the lower bounds we have fitted. Results from this section can be combined with Group 3 xgboost results above where we predicted the raw score minus the age lower bound only.
### Create group 3 training data
### Same Group 3 predictors, but the target is the raw score itself
### (no age-spline adjustment).
## Select features and round count features
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_juv_fel_count,
p_felprop_violarrest,
p_murder_arrest,
p_felassault_arrest,
p_misdemassault_arrest,
#p_famviol_arrest,
p_sex_arrest,
p_weapons_arrest,
p_n_on_probation,
p_current_on_probation,
p_prob_revoke,
raw_score)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score) %>% as.matrix(),
"label" = train %>% select(raw_score) %>% as.matrix()
)
set.seed(347)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score
print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.4506"
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab("Violent score") +
ylab("XGBoost prediction")+
#annotate("text", x = -3.5, y = 0.5, label = paste("RMSE:",round(rmse(pred, actual),4)))+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
ggsave("Figures/raw_xgboost_gp3_violent.pdf",width = 3, height = 3, units = "in")
### Create group 3 training data
## Select features and round count features
## Same group 3 predictor set, but the target is now the raw score with
## both fitted lower bounds removed (age spline and violent history).
gp3_cols = c(
  "p_current_age",
  "p_age_first_offense",
  "p_juv_fel_count",
  "p_felprop_violarrest",
  "p_murder_arrest",
  "p_felassault_arrest",
  "p_misdemassault_arrest",
  #"p_famviol_arrest",
  "p_sex_arrest",
  "p_weapons_arrest",
  "p_n_on_probation",
  "p_current_on_probation",
  "p_prob_revoke"
)
train = features_filt %>%
  select(one_of(gp3_cols), raw_score__age_spline__g_vio_hist)
## Format for xgboost
## DMatrix for the residual target: predictors are every column except
## the target; the label is the raw score minus both lower bounds.
predictor_mat = as.matrix(train[, names(train) != "raw_score__age_spline__g_vio_hist"])
train_xgb = xgb.DMatrix(
  "data" = predictor_mat,
  "label" = train$raw_score__age_spline__g_vio_hist
)
## Fix the RNG state so this parameter search is reproducible (different
## seed from the raw-score fit above).
set.seed(7301)
## fit_xgboost() (defined in functions.r) appears to search over the 16
## parameter sets in `param` and return the best model -- confirm against
## functions.r.
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 6
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
## In-sample predictions and RMSE for the residual-target model.
pred = predict(mdl_xgb, newdata = train_xgb)
actual = train$raw_score__age_spline__g_vio_hist
print(paste("RMSE:", round(rmse(pred, actual), 4)))
## [1] "RMSE: 0.4506"
## Shared axis limits so the prediction-vs-score scatter is square.
axis_min = min(pred, actual)
axis_max = max(pred, actual)
## Scatter plot of XGBoost predictions against the residual target
## (violent score minus f_viol_age minus g_viol_hist), with y = x shown.
plt = data.frame(xgboost = pred, compas = actual) %>%
  ggplot() +
  geom_point(aes(x = compas, y = xgboost), alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(c(axis_min, axis_max)) +
  ylim(c(axis_min, axis_max)) +
  theme_bw() +
  coord_fixed() +
  xlab(expression(Violent~score~-~f[viol~age]~-~g[viol~hist])) +
  ylab("XGBoost prediction") +
  #annotate("text", x = 0.5, y = 4, label = paste("RMSE:",round(rmse(pred, actual),4)))+
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )
print(plt)
## Save the figure at the paper's panel size.
ggsave("Figures/raw-fage-gviohist_xgboost_gp3_violent.pdf", plot = plt, width = 3, height = 3, units = "in")
## Rebuild the ProPublica-style data set for the logistic regression.
## as.numeric(<logical>) yields the same 1/0 (NA-propagating) coding as
## if_else(cond, 1, 0).
propub = features_filt %>%
  filter(filt4) %>% # Only people with valid recidivism values
  mutate(
    age_low = as.numeric(p_current_age < 25),
    age_high = as.numeric(p_current_age > 45),
    female = as.numeric(sex == "Female"),
    n_priors = p_felony_count_person + p_misdem_count_person,
    ## Medium and High risk scores (decile >= 5) get the positive label.
    compas_high = as.numeric(`Risk of Violence_decile_score` >= 5),
    ## Base level is Caucasian, as in the ProPublica analysis.
    race = relevel(factor(race), ref = "Caucasian")
  )
print(paste("Number of observations for logistic regression:", nrow(propub)))
## [1] "Number of observations for logistic regression: 13505"
## ProPublica-style logistic regression: model the probability of a
## medium/high COMPAS violent score (compas_high) from sex, the age
## indicators, race (Caucasian as reference level, set above), charge
## count, misdemeanor indicator, and violent recidivism.
mdl_glm = glm(compas_high ~
female +
age_high +
age_low +
as.factor(race) +
p_charge +
is_misdem +
recid_violent,
family=binomial(link='logit'), data=propub)
## Coefficient table; the printed output below shows the fitted model.
summary(mdl_glm)
##
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) +
## p_charge + is_misdem + recid_violent, family = binomial(link = "logit"),
## data = propub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6043 -0.5313 -0.3066 0.7059 3.0149
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.614956 0.065330 -40.027 < 2e-16 ***
## female -0.584530 0.065149 -8.972 < 2e-16 ***
## age_high -1.576448 0.137479 -11.467 < 2e-16 ***
## age_low 2.775651 0.054379 51.043 < 2e-16 ***
## as.factor(race)African-American 0.728572 0.055913 13.030 < 2e-16 ***
## as.factor(race)Asian -0.851084 0.444399 -1.915 0.0555 .
## as.factor(race)Hispanic 0.101528 0.094176 1.078 0.2810
## as.factor(race)Native American 0.412826 0.478234 0.863 0.3880
## as.factor(race)Other 0.019089 0.111656 0.171 0.8643
## p_charge 0.076164 0.003581 21.269 < 2e-16 ***
## is_misdem -0.418826 0.052150 -8.031 9.66e-16 ***
## recid_violent 0.715798 0.066871 10.704 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16544 on 13504 degrees of freedom
## Residual deviance: 10743 on 13493 degrees of freedom
## AIC: 10767
##
## Number of Fisher Scoring iterations: 6